# Load necessary libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Load the dataset
# NOTE(review): the path is absolute and Windows-specific; adjust it when
# running on another machine.
spotify <- read.csv("C:\\spotify_tracks.csv")

# Data cleaning: keep only the analysed columns and drop rows that are missing
# OR carry placeholder values. Checking `is.na()` alone is not enough for this
# file: the recorded ANOVA output reports 12 degrees of freedom for
# `factor(key)` (13 distinct keys, although valid keys are 0-11), and the
# recorded 3D plot shows a point at liveness = -1, instrumentalness = -1, so
# "unknown" is evidently encoded as -1. Left in, those rows bias every mean,
# test and plot that follows.
# Valid ranges used: liveness and instrumentalness in [0, 1], key in 0-11,
# duration strictly positive.
# The `##` output recorded in this file predates this filter; re-run the
# script to refresh it.
spotify_clean <- spotify %>%
  select(liveness, instrumentalness, key, duration_ms) %>%
  filter(
    between(liveness, 0, 1),         # NA yields NA, which filter() drops
    between(instrumentalness, 0, 1),
    key %in% 0:11,                   # %in% is FALSE for NA and for -1
    duration_ms > 0
  )

# Display structure of cleaned data
str(spotify_clean)
## 'data.frame':    62317 obs. of  4 variables:
##  $ liveness        : num  0.1 0.0951 0.0831 0.124 0.345 0.215 0.178 0.132 0.299 0.114 ...
##  $ instrumentalness: num  5.53e-02 0.00 0.00 7.27e-04 1.35e-06 3.53e-02 1.35e-05 0.00 3.70e-01 1.96e-02 ...
##  $ key             : num  8 10 2 7 7 7 9 4 4 1 ...
##  $ duration_ms     : num  97297 207369 82551 115831 129621 ...
# Interpretation:
# All four columns are stored as numeric. `liveness`, `instrumentalness`, and
# `duration_ms` are analysed as continuous variables, while `key` (musical key,
# nominally 0-11) is treated as categorical by wrapping it in factor() where needed.

### 1. Descriptive Statistics ###
# Most frequent value of a numeric vector: tabulate the distinct values (table()
# orders them ascending) and return the first one that reaches the top count.
most_frequent_value <- function(values) {
  counts <- table(values)
  as.numeric(names(counts)[which.max(counts)])
}

# Central tendency and dispersion of liveness
mean_liveness <- mean(spotify_clean$liveness)
median_liveness <- median(spotify_clean$liveness)
mode_liveness <- most_frequent_value(spotify_clean$liveness)
sd_liveness <- sd(spotify_clean$liveness)
var_liveness <- var(spotify_clean$liveness)

# Central tendency and dispersion of instrumentalness
mean_instrumentalness <- mean(spotify_clean$instrumentalness)
median_instrumentalness <- median(spotify_clean$instrumentalness)
mode_instrumentalness <- most_frequent_value(spotify_clean$instrumentalness)
sd_instrumentalness <- sd(spotify_clean$instrumentalness)
var_instrumentalness <- var(spotify_clean$instrumentalness)

# Print Descriptive Statistics
# Both variables are reported in the same one-line layout, so that line is
# produced by a single helper. Arguments, in order: mean, median, mode,
# standard deviation, variance.
print_stat_line <- function(avg, mid, top, spread, spread_sq) {
  cat("Mean:", avg, "Median:", mid, "Mode:", top,
      "Standard Deviation:", spread, "Variance:", spread_sq, "\n")
  invisible(NULL)
}

cat("Descriptive Statistics for Liveness:\n")
## Descriptive Statistics for Liveness:
print_stat_line(mean_liveness, median_liveness, mode_liveness,
                sd_liveness, var_liveness)
## Mean: 0.1941425 Median: 0.125 Mode: 0.11 Standard Deviation: 0.1720304 Variance: 0.02959447
cat("Descriptive Statistics for Instrumentalness:\n")
## Descriptive Statistics for Instrumentalness:
print_stat_line(mean_instrumentalness, median_instrumentalness,
                mode_instrumentalness, sd_instrumentalness,
                var_instrumentalness)
## Mean: 0.1462145 Median: 2.54e-05 Mode: 0 Standard Deviation: 0.3078036 Variance: 0.09474303
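# Interpretation:
# Liveness: in the recorded output the mean (0.194) is well above the median (0.125), so the distribution is right-skewed.
# Instrumentalness: the median (2.54e-05) and mode (0) are essentially zero while the mean is 0.146, so most tracks score close to zero and a minority of high-scoring tracks pulls the mean up.
# The standard deviations (0.172 and 0.308) quantify that spread around the mean.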

### 2. Inferential Statistics ###

# a. One-sample t-test on liveness
# Hypotheses (two-sided test, 95% confidence interval):
#   H0: mean liveness is equal to 0.2
#   H1: mean liveness is different from 0.2
t_test_one_sample <- t.test(
  x = spotify_clean$liveness,
  mu = 0.2,
  alternative = "two.sided",
  conf.level = 0.95
)
print(t_test_one_sample)
## 
##  One Sample t-test
## 
## data:  spotify_clean$liveness
## t = -8.4998, df = 62316, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0.2
## 95 percent confidence interval:
##  0.1927918 0.1954932
## sample estimates:
## mean of x 
## 0.1941425
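# Interpretation:
# The recorded p-value (< 2.2e-16) is below 0.05, so H0 is rejected: mean liveness differs significantly from 0.2.
# The estimated mean is 0.194 (95% CI 0.1928 to 0.1955), so the difference is statistically significant but small in size.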

# b. Two-sample t-test: track duration by instrumentalness level
# Hypotheses:
#   H0: mean duration_ms is equal in the High and Low instrumentalness groups
#   H1: mean duration_ms differs between the two groups
# Tracks with instrumentalness above 0.5 are labelled "High", all others "Low".
spotify_clean <- spotify_clean %>%
  mutate(
    instrumentalness_group = if_else(instrumentalness > 0.5, "High", "Low")
  )
t_test_two_sample <- t.test(
  duration_ms ~ instrumentalness_group,
  data = spotify_clean
)
print(t_test_two_sample)
## 
##  Welch Two Sample t-test
## 
## data:  duration_ms by instrumentalness_group
## t = -34.382, df = 12395, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group High and group Low is not equal to 0
## 95 percent confidence interval:
##  -51489.90 -45935.62
## sample estimates:
## mean in group High  mean in group Low 
##           201528.8           250241.6
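# Interpretation:
# The recorded p-value (< 2.2e-16) is below 0.05, so mean track duration differs between the groups.
# High-instrumentalness tracks average about 201,529 ms versus 250,242 ms for Low, i.e. roughly 46 to 51 seconds shorter (95% CI).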

# c. Paired t-test: instrumentalness versus liveness of the same track
# Hypotheses (two-sided test, 95% confidence interval):
#   H0: the mean per-track difference (instrumentalness - liveness) is zero
#   H1: the mean per-track difference is not zero
t_test_paired <- t.test(
  x = spotify_clean$instrumentalness,
  y = spotify_clean$liveness,
  paired = TRUE,
  alternative = "two.sided",
  conf.level = 0.95
)
print(t_test_paired)
## 
##  Paired t-test
## 
## data:  spotify_clean$instrumentalness and spotify_clean$liveness
## t = -32.706, df = 62316, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.05080020 -0.04505578
## sample estimates:
## mean difference 
##     -0.04792799
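# Interpretation:
# The recorded p-value (< 2.2e-16) is below 0.05: per track, instrumentalness is on average 0.048 lower than liveness.
# The two features measure different things, so this result only compares their average score levels.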

# d. One-Way ANOVA for Duration_ms Across Keys
# Hypotheses:
# H0: Mean duration_ms is the same for every key.
# H1: Mean duration_ms differs for at least one key.
# `key` is stored as a number, so it is wrapped in factor() to be treated as a
# grouping variable rather than as a numeric predictor.
# NOTE(review): the recorded output reports Df = 12 for factor(key), i.e. 13
# distinct key values, whereas the bar-plot label describes keys as 0-11.
# Confirm whether a placeholder value is present in `key`.
anova_one_way <- aov(duration_ms ~ factor(key), data = spotify_clean)
# Evaluated at top level so the ANOVA table is auto-printed.
summary(anova_one_way)
##                Df    Sum Sq   Mean Sq F value Pr(>F)    
## factor(key)    12 1.834e+12 1.528e+11   11.99 <2e-16 ***
## Residuals   62304 7.939e+14 1.274e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
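# Interpretation:
# The recorded p-value (< 2e-16) is below 0.05, so mean track duration differs for at least one musical key.
# ANOVA does not say which keys differ; a post-hoc comparison such as TukeyHSD() would be needed for that.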

# e. Linear Regression: Liveness as a Predictor of Duration_ms
# Fits a linear model with duration_ms as the response and liveness as the
# only predictor; the recorded coefficient table also lists the intercept.
lm_model <- lm(duration_ms ~ liveness, data = spotify_clean)
# Evaluated at top level so the coefficient table, R-squared and F-test are
# auto-printed.
summary(lm_model)
## 
## Call:
## lm(formula = duration_ms ~ liveness, data = spotify_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -233992  -50210   -6266   43737 4339096 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 241793.3      682.5 354.254   <2e-16 ***
## liveness      3779.6     2631.3   1.436    0.151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 113000 on 62315 degrees of freedom
## Multiple R-squared:  3.311e-05,  Adjusted R-squared:  1.706e-05 
## F-statistic: 2.063 on 1 and 62315 DF,  p-value: 0.1509
# Interpretation:
# In the recorded output the liveness coefficient is not significant (p = 0.151) and R-squared is about 3.3e-05,
# so liveness explains essentially none of the variation in track duration.

### 3. Data Visualizations ###

# Single-variable: bar chart of the number of tracks per musical key
key_bar_plot <- ggplot(data = spotify_clean, mapping = aes(x = factor(key))) +
  geom_bar(color = "black", fill = "skyblue") +
  theme_minimal() +
  labs(title = "Distribution of Musical Keys", x = "Key (0-11)", y = "Count")
# Evaluate the plot object at top level so it is drawn.
key_bar_plot

# Interpretation:
# Displays the frequency of tracks in each musical key. Peaks indicate commonly used keys.

# Bi-variable: scatter plot with instrumentalness on x and liveness on y
liveness_scatter_plot <- ggplot(
  data = spotify_clean,
  mapping = aes(x = instrumentalness, y = liveness)
) +
  geom_point(color = "blue") +
  theme_minimal() +
  labs(title = "Liveness vs Instrumentalness", x = "Instrumentalness", y = "Liveness")
# Evaluate the plot object at top level so it is drawn.
liveness_scatter_plot

# Interpretation:
# Visualizes the relationship between liveness and instrumentalness. Patterns or clustering suggest correlations.

# Multi-variable: interactive 3D scatter of duration (x), liveness (y) and
# instrumentalness (z); markers are also coloured by instrumentalness.
# Step 1: the marker trace.
plot_3d <- plot_ly(
  data = spotify_clean,
  x = ~duration_ms,
  y = ~liveness,
  z = ~instrumentalness,
  type = "scatter3d",
  mode = "markers",
  marker = list(
    size = 3,
    color = ~instrumentalness,
    colorscale = "Viridis",
    opacity = 0.8
  )
)

# Step 2: figure title and axis titles.
plot_3d <- layout(
  plot_3d,
  title = "3D Scatter Plot of Duration, Liveness, and Instrumentalness",
  scene = list(
    xaxis = list(title = "Duration (ms)"),
    yaxis = list(title = "Liveness"),
    zaxis = list(title = "Instrumentalness")
  )
)

# Display the 3D plot
plot_3d
## x: 209.893k, y: −1, z: −1
## 3D Scatter Plot of Duration, Liveness, and Instrumentalness
# Interpretation:
# This interactive 3D plot shows how duration, liveness, and instrumentalness interact simultaneously.

### 4. Conclusion Summary ###

# - Descriptive Statistics:
#   - `Liveness` and `Instrumentalness` show their central tendencies and variability, giving insights into their spread in the dataset.
# - Inferential Statistics:
#   - One-sample t-test: Tested if mean liveness significantly deviates from 0.2.
#   - Two-sample t-test: Compared durations for high and low instrumentalness groups.
#   - Paired t-test: Explored differences between liveness and instrumentalness means.
#   - One-way ANOVA: Analyzed differences in duration_ms across keys.
#   - Linear Regression: Examined how liveness predicts duration.
# - Visualizations:
#   - Bar plot: Showed the frequency of tracks across musical keys.
#   - Scatter plot: Explored the relationship between liveness and instrumentalness.
#   - 3D scatter plot: Combined multiple variables to observe their interactions.
# - Overall Insights:
#   - In the recorded results, track duration differs between the high and low instrumentalness groups and across musical keys, whereas liveness does not significantly predict duration (regression p = 0.151).
#   - These insights can guide music production, recommendations, or further research into music analytics.